clear all
set more off

capture log close
log using fpr_calc.log, replace

cd "${path}build"

use vin station_id overall ovral* teststart odometer insp_reasn test_type hc_mode* co_mode* no_mode* using smog 


keep if overall == "P" | overall == "F" | overall == "G"

drop if year(dofc(teststart)) >2012 | year(dofc(teststart))<1996


forvalues i = 1/2{
    replace no_mode`i' = . if test_type == "T"
}

foreach pollutant in hc no{
    forvalues i = 1/2{
        replace `pollutant'_mode`i' = 1 if `pollutant'_mode`i' <= 0
    }

}
forvalues i = 1/2{
    replace co_mode`i' = .01 if co_mode`i' <=0
}

foreach pollutant in hc no co{
	gen `pollutant' = ln((`pollutant'_mode1 + `pollutant'_mode2)/2)
	drop `pollutant'_mode*
}

/*Drop singletons; they won't be useful*/
bysort vin: drop if _N ==1

/*Also drop vehicles with more than one observation at the same time*/
bysort vin teststart: gen byte doubles = _N>1
by vin: gegen byte somedouble = max(doubles)
drop if somedouble
drop doubles somedouble


drop if overall == "T" | overall == "A"

/*Decode and organize older cars*/

gen prefix = substr(vin,1,8) + substr(vin,10,2) if length(vin) == 17
preserve
keep if length(vin) == 17
tempfile newvins
save `newvins'
restore
keep if length(vin) <17


gen qprefix = substr(vin,1,length(vin)-6) 
replace qprefix = substr(vin,1,length(vin)-5) if regexm(vin,"^(HL530|HLS30|PL620)") & length(vin)==10
replace qprefix = substr(vin,1,length(vin)-5) if regexm(vin,"^(116|126|128|196|198|206|208)") & substr(vin,4,1)!="2" & regexm(vin,"[A-Z]")==0 & length(vin)== 10
replace qprefix = substr(vin,1,length(vin)-5) if regexm(vin,"^[A-Z][A-Z][0-9][0-9]") & length(vin) == 9 & regexm(substr(vin,3,.),"[A-Z]")==0 & substr(vin,1,2) != "AR"
replace qprefix = substr(vin,1,length(vin)-5) if regexm(vin,"^(BNA61|M10A1|SN3A1|STCV2)") & length(vin) == 10
replace qprefix = substr(vin,1,length(vin)-4) if regexm(vin,"^(SVAV1|STC35)") & length(vin) == 9
replace qprefix = substr(vin,1,length(vin)-4) if length(vin) == 7 & regexm(vin,"[A-Z]") == 0

gen make_str = "OTH"
gen oldmake = "OTHER"

do modelchar

replace prefix = qprefix + "I" if length(qprefix) == 9 & modelchar == "" 
replace prefix = qprefix + "I" + string(length) if length(qprefix) == 7 & modelchar == ""
replace prefix = qprefix + "II" + string(length) if length(qprefix) == 6 & modelchar == ""
replace prefix = qprefix + "III" + string(length) if length(qprefix) == 5 & modelchar == ""
replace prefix = qprefix + "IIII" + string(length) if length(qprefix) == 4 & modelchar == ""
replace prefix = qprefix + "IIIII" + string(length) if length(qprefix) == 3 & modelchar == ""


replace prefix = qprefix + "I" + modelchar+ "I" + string(length) if length(qprefix) >= 5 &modelchar !=""
replace prefix = qprefix + "II" + modelchar+ "I" + string(length) if length(qprefix) == 4 &modelchar !=""
replace prefix = qprefix + "III" + modelchar+ "I" + string(length) if length(qprefix) == 3 &modelchar !=""

drop modelchar length serial qprefix
drop if prefix == ""

append using `newvins'


gen date = dofc(teststart)
gen year = year(date)


/* Now, to the extent possible, correct mileage for errors and rollovers*/


/*We will set the data up as a panel and count the hundred thousand mile marks*/
/*Get some temparary variables*/
tempvar time id hundreds maxtime stickoutflag

/*Vin must be coded numeric to use it in an xtset*/
gegen double `id' = group(vin)

/*Order by time, and use the order for the time variable*/
sort `id' teststart odometer

by `id': gen byte `time' = _n

/*If there are more than 39 smog checks for one vin, drop it--something is surely wrong*/
by `id': gegen `maxtime' = max(`time')

drop if `maxtime' > 39
drop `maxtime'




xtset `id' `time'


/*We need the highest period for the loop*/
qui summ `time'

local maxtime = r(max)



/*Initialize the estimated mileage and hundreds counter*/
gen estmiles = odometer


by `id':gegen multizero = max(odometer == 0 & (L1.odometer == 0 | F1.odometer == 0))
drop if multizero
drop multizero

/*And drop smog checks with a zero odometer reading*/
drop if odometer == 0

/*Likewise, observations greater than or near 1 million make no sense*/

drop if odometer == 999999

replace estmiles = round(estmiles/100) if estmiles > 1000000 & (L1.estmiles<100000|F1.estmiles <100000)
replace estmiles = round(estmiles/10) if estmiles > 1000000 & (L1.estmiles>=100000|F1.estmiles>=100000)
replace estmiles = round(estmiles/10) if estmiles >500000
replace estmiles = round(estmiles/10) if estmiles > 200000 & (floor(estmiles/100000) == floor(L1.estmiles/10000)|floor(estmiles/100000) == floor(F1.estmiles/10000)|(estmiles/10>L1.estmiles & estmiles/10<F1.estmiles))

 
/* Identify observations that "stick out" and correct them*/

/*Flag observations*/

gen byte `stickoutflag' = (mod(L1.estmiles,100000) <= mod(F1.estmiles,100000) & F1.estmiles !=.) & (mod(estmiles,100000) > mod(F1.estmiles,100000) | mod(estmiles,100000) < mod(L1.estmiles,100000) | (L1.estmiles > 100000 & F1.estmiles > 100000 & (estmiles < L1.estmiles | estmiles > F1.estmiles))) 



/*Fix isolated observations -- cases where there are two in a row are more complicated*/

/*1: Use previous observation if within a few days*/


replace estmiles = L1.estmiles if date - L1.date < 20 & `stickoutflag' == 1 
 
replace `stickoutflag' = (mod(L1.estmiles,100000) <= mod(F1.estmiles,100000) & F1.estmiles !=.) & (mod(estmiles,100000) > mod(F1.estmiles,100000) | mod(estmiles,100000) < mod(L1.estmiles,100000) | (L1.estmiles > 100000 & F1.estmiles > 100000 & (estmiles < L1.estmiles | estmiles > F1.estmiles))) 

/*2: Try dividing and multiplying by 10 */

replace estmiles = round(estmiles/10) if `stickoutflag' == 1  & mod(L1.estmiles,100000) < mod(estmiles/10,100000) & estmiles/10<mod(F1.estmiles,100000) & estmiles > 200000 & L1.estmiles < 200000
replace estmiles = round(estmiles/10) if `stickoutflag' == 1  & mod(L1.estmiles,100000) < mod(estmiles/10,100000) & estmiles/10<mod(F1.estmiles,100000) & L1.estmiles < 20000 & estmiles < 200000 & estmiles >=100000


replace estmiles = estmiles*10 if `stickoutflag' == 1  & mod(L1.estmiles,100000) < mod(estmiles*10,100000) & estmiles*10<mod(F1.estmiles,100000) & estmiles < 200000

replace estmiles = round(estmiles/10) if mod(L1.estmiles,100000) > 85000 & estmiles/10<mod(F1.estmiles,100000)  & mod(F1.estmiles,100000) < 10000 & mod(estmiles,100000) < mod(L1.estmiles,100000) & estmiles > 10000 & L1.estmiles !=. & F1.estmiles !=.

replace `stickoutflag'= (mod(L1.estmiles,100000) <= mod(F1.estmiles,100000) & F1.estmiles !=.) & (mod(estmiles,100000) > mod(F1.estmiles,100000) | mod(estmiles,100000) < mod(L1.estmiles,100000) | (L1.estmiles > 100000 & F1.estmiles > 100000 & (estmiles < L1.estmiles | estmiles > F1.estmiles))) 

/*3: Try adjusting potential number switches*/

/*6 for 9*/
replace estmiles =floor(estmiles/100000)*100000+ 90000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==6 & floor(estmiles/100000)*100000+90000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+floor(estmiles/100000)*100000+90000 + mod(estmiles,10000) < F1.estmiles
/*9 for 6 */
replace estmiles =floor(estmiles/100000)*100000+ 60000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==9 & floor(estmiles/100000)*100000+60000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+60000 + mod(estmiles,10000) < F1.estmiles
/*9 for 7 */
replace estmiles =floor(estmiles/100000)*100000+ 70000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==9 & floor(estmiles/100000)*100000+60000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+60000 + mod(estmiles,10000) < F1.estmiles
/*7 for 9 */
replace estmiles =floor(estmiles/100000)*100000+ 90000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==7 & floor(estmiles/100000)*100000+70000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+70000 + mod(estmiles,10000) < F1.estmiles
/*8 for 3 */
replace estmiles =floor(estmiles/100000)*100000+ 30000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==8 & floor(estmiles/100000)*100000+30000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+30000 + mod(estmiles,10000) < F1.estmiles
/*3 for 8 */
replace estmiles =floor(estmiles/100000)*100000+ 80000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==3 & floor(estmiles/100000)*100000+80000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+80000 + mod(estmiles,10000) < F1.estmiles
/*4 for 9 */
replace estmiles =floor(estmiles/100000)*100000+ 90000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==4 & floor(estmiles/100000)*100000+90000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+90000 + mod(estmiles,10000) < F1.estmiles
/*9 for 4 */
replace estmiles =floor(estmiles/100000)*100000+ 40000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==9 & floor(estmiles/100000)*100000+40000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  floor(estmiles/100000)*100000+40000 + mod(estmiles,10000) < F1.estmiles

/*4: Linearly impute remaining ridiculous values*/

replace `stickoutflag' = (mod(L1.estmiles,100000) <= mod(F1.estmiles,100000) & F1.estmiles !=.) & (mod(estmiles,100000) > mod(F1.estmiles,100000) | mod(estmiles,100000) < mod(L1.estmiles,100000) | (L1.estmiles > 100000 & F1.estmiles > 100000 & (estmiles < L1.estmiles | estmiles > F1.estmiles))) 
replace estmiles = round(L1.estmiles + (date - L1.date)*(mod(F1.estmiles,100000) - mod(L1.estmiles,100000))/(F1.date - L1.date+1)) if `stickoutflag'==1 & (mod(F1.estmiles,100000) - mod(L1.estmiles,100000))/(F1.date - L1.date+1) >=0 & (mod(F1.estmiles,100000) - mod(L1.estmiles,100000))/(F1.date - L1.date+1) <200 



/*Deal with things that stick out on the end*/
replace `stickoutflag' = F1.estmiles == . & L1.estmiles !=. & mod(estmiles,100000)< mod(L1.estmiles,100000) & (mod(estmiles,100000) + 100000 - mod(L1.estmiles,100000))/(date-L1.date) > 100




replace estmiles = L1.estmiles if date - L1.date < 20 & `stickoutflag' == 1 
 
replace `stickoutflag' = F1.estmiles == . & L1.estmiles !=. & mod(estmiles,100000)< mod(L1.estmiles,100000) & (mod(estmiles,100000) + 100000 - mod(L1.estmiles,100000))/(date-L1.date) > 100

/*2: Try dividing and multiplying by 10 */

replace estmiles = estmiles/10 if `stickoutflag' == 1  & mod(L1.estmiles,100000) < mod(estmiles/10,100000) & estmiles/10<mod(F1.estmiles,100000) & estmiles > 200000
replace estmiles = estmiles*10 if `stickoutflag' == 1  & mod(L1.estmiles,100000) < mod(estmiles*10,100000) & (estmiles*10 -L1.estmiles)/(date-L1.date) < 200 & estmiles < 200000

replace `stickoutflag' = F1.estmiles == . & L1.estmiles !=. & mod(estmiles,100000)< mod(L1.estmiles,100000) & (mod(estmiles,100000) + 100000 - mod(L1.estmiles,100000))/(date-L1.date) > 100

/*3: Try adjusting potential number switches*/

/*6 for 9*/
replace estmiles = 90000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==6 & 90000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  90000 + mod(estmiles,10000) < F1.estmiles
/*9 for 6 */
replace estmiles = 60000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==9 & 60000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  60000 + mod(estmiles,10000) < F1.estmiles
/*9 for 7 */
replace estmiles = 70000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==9 & 60000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  60000 + mod(estmiles,10000) < F1.estmiles
/*7 for 9 */
replace estmiles = 90000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==7 & 70000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  70000 + mod(estmiles,10000) < F1.estmiles
/*8 for 3 */
replace estmiles = 30000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==8 & 30000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  30000 + mod(estmiles,10000) < F1.estmiles
/*3 for 8 */
replace estmiles = 80000 + mod(estmiles,10000) if `stickoutflag' == 1  & floor(estmiles/10000) ==3 & 80000 + mod(estmiles,10000) > mod(L1.estmiles,100000) &  80000 + mod(estmiles,10000) < F1.estmiles

/*4: Linearly impute remaining ridiculous values*/

replace `stickoutflag' = F1.estmiles == . & L1.estmiles !=. & mod(estmiles,100000)< mod(L1.estmiles,100000) & (mod(estmiles,100000) + 100000 - mod(L1.estmiles,100000))/(date-L1.date) > 100
replace estmiles = round(L1.estmiles + (date - L1.date)*(mod(L1.estmiles,100000) - mod(L2.estmiles,100000))/(L1.date - L2.date+1)) if `stickoutflag'==1 & L2.date !=.
replace estmiles = L1.estmiles if `stickoutflag' == 1 & L2.date == . 



/*There are instances where an extra decimal place was added incorrectly at the end*/

replace estmiles = round(estmiles/10) if L1.estmiles < 100000 & estmiles >= L1.estmiles*10 & estmiles > 200000


/* There are also cases where there are two smog checks on the same day or within a few days 
	of each other with drastically different odometer readings.  Take the first reading in these cases */

replace estmiles = L1.estmiles if date- L1.date < 20

//replace odometer = L1.odometer if date-L1.date < 10 & L1.odometer - odometer < 0 & L1.odometer - odometer >-15000


gen byte `hundreds'= floor(estmiles/100000) if `time'==1
disp "Starting `maxtime' iterations"
/*Now for each "period", up the hundreds if there was a rollover, and adjust the estimated mileage accordingly*/
forvalues period = 2/`maxtime'{
	if mod(`period',5) == 0 disp `period'
	qui replace `hundreds' = max(L1.`hundreds',floor(estmiles/100000)) if `time' == `period'
	qui replace `hundreds' = `hundreds' + 1 if `time'==`period' ///
					& (mod(L1.estmiles,100000)>mod(estmiles,100000) & estmiles<L1.estmiles) ///
					& ((estmiles+100000 -L1.estmiles)/(date-L1.date)<200) 
	qui replace estmiles = `hundreds'*100000 + mod(estmiles,100000) if `time' == `period'
}

/* Now go backwards and correct rollovers which happen before we observed the car (i.e., 
	where we see 80000, followed by 181000, and don't believe that the car could actually
	have travelled 101,000 miles. */


forvalues period = `maxtime'(-1)1{
	if `period' == `maxtime' continue
	if mod(`period',5) == 0 disp `period'
	qui replace `hundreds' = floor(F1.estmiles/100000) if `time' == `period' ///
					& (mod(F1.estmiles,100000)>=mod(estmiles,100000) & F1.estmiles - estmiles >=100000 & F1.estmiles !=.) ///
					& (F1.date-date < 1500)

	qui replace estmiles = `hundreds'*100000 + mod(estmiles,100000) if `time'==`period'
	qui replace `hundreds' = F1.`hundreds' - 1 if `time' == `period' ///
					& (F1.estmiles - estmiles >= 100000) & F1.estmiles >=200000 & F1.date-date<1500
	qui replace estmiles = `hundreds'*100000 + mod(estmiles,100000) if `time'==`period'
}

gen mpd = (estmiles - L1.estmiles)/(date - L1.date + 1)

tempvar dropflag count

by `id': gegen dropflag = max(estmiles > 800000 | mpd <0 | (mpd>200 & mpd < .))



/*Define the initial inspection of each cycle*/

gen int timesince = date - L1.date

gen byte first = `time' == 1

by `id': gen byte initial_inspection = (date - L1.date >60  & overall[_n-1] !="F" & overall[_n-1]!="G") | _n==1

/*While this is still sorted by vin and time, get the station which last inspected and when it was, so that later
we can link this back for the follow-up pass rate*/

gen long laststation = L1.station_id if  initial_inspection == 1
gen lastdate = L1.date if initial_inspection == 1

drop __0*  odometer

/*Define the "cycle period" for purposes of the average failure rate*/

replace prefix = substr(vin,1,8) + substr(vin,10,1) if length(vin)==17
gen int quarter = qofd(date)
gen int half = hofd(date)
gen int lastquarter = qofd(lastdate)
gen int lasthalf = hofd(lastdate)

gen fail = overall == "F" | overall == "G"

/*Calculate average failure rate by vehicle type in current cycle*/

bysort prefix quarter: gegen failrate_av = mean(fail) if initial_inspection == 1

/*This will require variables for make and model year*/
replace make_str = substr(vin,1,3) if length(vin) == 17
gegen make = group(prefix)
gen modelchar = substr(vin,10,1)
gen modelyr = .
do modelchar_all

sort vin date teststart

replace timesince = 730 if first

/*Now regress failure dummy on fixed effects and characteristics we can control for*/
xtset make

xtreg fail estmiles timesince i.quarter if initial_inspection==1 & dropflag==0, fe

/*Get the means of mileage and time since last inspection*/
foreach var in estmiles timesince{
    bysort quarter modelyr prefix: gegen `var'_av = mean(`var') if initial_inspection==1 & dropflag==0
}

/*Expected failure rate is the actual failure rate by prefix, plus the change in failure probability due to 
mileage or time since last inspection which is above or below the mean*/
gen failrate_expected = max(failrate_av + (estmiles-estmiles_av)*_b[estmiles]*(dropflag==0) + (timesince-timesince_av)*_b[timesince],0)



/*Calculate actual failure rate by station each quarter*/
gen failrate = overall=="F"|overall=="G" if initial_inspection == 1

/*Then the average to compare to is the average expected failure rate of the cars they inspected*/
bysort quarter station_id: gegen failrate_average = mean(failrate_expected) if initial_inspection == 1

compress

save smog_precollapse, replace

use smog_precollapse, replace

/*Generate a couple of new variables so that we can describe the profile of vehicles being inspected at each station-quarter*/

gen asm = test_type == "A"

gen directed = inlist(insp_reasn,"D","S","P")


gen tenplus = year-modelyr>=10 






/*Now we will take the actual and expected pass rates by station, and turn them into the STAR quality measures*/

/*First is the "similar vehicle pass rate" (SVFR).  This is simply the ratio of your actual failure rate to the expected
failure rate based on the types of vehicles being inspected.  Stations must have a ratio of 75% (fail vehicles 75% as
often as the average station) to qualify for STAR)*/




gcollapse failrate failrate_expected (count) N = failrate_av (sum) asm directed tenplus if initial_inspection == 1 ,  by(station quarter)

gen svfr_ratio = failrate/failrate_expected

tempfile temp
save `temp'


use smog_precollapse



/*Next is the "follow up pass rate" (FPR).  Here, we compare actual failure rates to expected failure rates, but assign the
result to the previous station doing the inspection.  The score is the t-statistic of the following one tailed hypothesis test:

H0: Todays failure rate is less than or equal to the expected failure rate
Ha: Today's failure rate is greater than the expected failure rate

Stations must have a score of 0.4 or better to qualify (strictly speaking, lowest ranked inspector must have 0.4 or better)*/

gcollapse failrate failrate_expected (sd) failrate_sd = failrate failrate_expected_sd = failrate_expected (count) N = failrate_av if initial_inspection == 1 & laststation!=., by(laststation half)

gen double fpr_score = ttail(N,(failrate-failrate_expected)/sqrt((failrate_sd + failrate_expected_sd)/N)) if N>30

/*In order to merge this, we need to have one observation per station-quarter, rather than station-half-year*/
expand 2, gen(tag)

gen quarter = qofd(dofh(half)) + tag

rename laststation station_id

keep fpr_score station quarter

joinby station_id quarter using `temp'

save `temp', replace

use smog_precollapse, replace

/*Now do the FPR again, except associate the future FPR with the quarter in which vehicles are being inspected.  This measure is not 
used by STAR*/

gcollapse failrate failrate_expected (sd) failrate_sd = failrate failrate_expected_sd = failrate_expected (count) N = failrate_av if initial_inspection == 1 & laststation!=., by(laststation lasthalf)


gen double fpr_current = ttail(N,(failrate-failrate_expected)/sqrt((failrate_sd + failrate_expected_sd)/N)) if N > 30

/*In order to merge this, we need to have one observation per station-quarter, rather than station-half-year*/
expand 2, gen(tag)

gen quarter = qofd(dofh(lasthalf)) + tag

rename laststation station_id
//rename lastquarter quarter


keep fpr station quarter

joinby station_id quarter using `temp'



mmerge station_id using station_census_geo.dta, unm(master) ukeep(county district zip)

save star_scores, replace

log close
